Packages¶

In [1]:
#dependencies and setup
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
import plotly.express as px
from openpyxl import Workbook
import numpy as np
import requests
import json
import re
from pprint import pprint
import time
# SQLite dependencies
import sqlite3
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, Float
from pandas_profiling import ProfileReport

DB connection¶

In [2]:
#  SQLite DB creation and establishing connection
database_path = "NJ_County_DB.sqlite"
engine = create_engine(f"sqlite:///{database_path}", echo=True)
sqlite_connection = engine.connect()

All tables¶

In [3]:
sql_query = """SELECT name FROM sqlite_master  
  WHERE type='table';"""
tbls= pd.read_sql(sql_query,sqlite_connection)
tbls
2023-03-27 13:59:15,441 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT name FROM sqlite_master  
  WHERE type='table';")
2023-03-27 13:59:15,445 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,447 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT name FROM sqlite_master  
  WHERE type='table';")
2023-03-27 13:59:15,448 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,450 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_master  
  WHERE type='table';
2023-03-27 13:59:15,452 INFO sqlalchemy.engine.Engine [raw sql] ()
Out[3]:
name
0 nj_mortgage_rates
1 nj_population
2 nj_zillow_house_value_index
3 nj_food_desert
4 nj_crime_detail
5 nj_school_performance
6 nj_zillow_observed_rent_index
7 nj_adi
8 nj_counties_dist_to_major_cities
9 nj_poverty_median_income
10 nj_property_tax

We have 11 datasets.

In [4]:
row_count=[]
max_year=[]
min_year=[]
for i in tbls['name'].tolist():
    try:
        sql_query = f"""SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM {i};"""
        df = pd.read_sql(sql_query,sqlite_connection)
        row_count.append(df['count'].values[0])
        min_year.append(df['min_year'].values[0])
        max_year.append(df['max_year'].values[0])
    except:
        sql_query = f"""SELECT count(*) as count FROM {i};"""
        df = pd.read_sql(sql_query,sqlite_connection)
        row_count.append(df['count'].values[0])
        min_year.append('NaN')
        max_year.append('NaN')
tbls['row_count'] = row_count
tbls['min_year'] = min_year
tbls['max_year'] = max_year
tbls
2023-03-27 13:59:15,501 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_mortgage_rates;")
2023-03-27 13:59:15,504 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,508 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_mortgage_rates;")
2023-03-27 13:59:15,510 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,514 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_mortgage_rates;
2023-03-27 13:59:15,515 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,523 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_population;")
2023-03-27 13:59:15,524 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,527 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_population;")
2023-03-27 13:59:15,530 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,533 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_population;
2023-03-27 13:59:15,535 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,543 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_house_value_index;")
2023-03-27 13:59:15,545 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,547 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_house_value_index;")
2023-03-27 13:59:15,551 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,554 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_house_value_index;
2023-03-27 13:59:15,556 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,563 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_food_desert;")
2023-03-27 13:59:15,565 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,568 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_food_desert;")
2023-03-27 13:59:15,569 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,572 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_food_desert;
2023-03-27 13:59:15,573 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,575 INFO sqlalchemy.engine.Engine ROLLBACK
2023-03-27 13:59:15,576 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count FROM nj_food_desert;")
2023-03-27 13:59:15,578 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,581 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count FROM nj_food_desert;")
2023-03-27 13:59:15,582 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,584 INFO sqlalchemy.engine.Engine SELECT count(*) as count FROM nj_food_desert;
2023-03-27 13:59:15,586 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,593 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_crime_detail;")
2023-03-27 13:59:15,595 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,597 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_crime_detail;")
2023-03-27 13:59:15,598 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,601 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_crime_detail;
2023-03-27 13:59:15,602 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,614 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_school_performance;")
2023-03-27 13:59:15,616 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,618 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_school_performance;")
2023-03-27 13:59:15,620 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,623 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_school_performance;
2023-03-27 13:59:15,625 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,635 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_observed_rent_index;")
2023-03-27 13:59:15,637 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,639 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_observed_rent_index;")
2023-03-27 13:59:15,640 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,642 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_zillow_observed_rent_index;
2023-03-27 13:59:15,643 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,648 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_adi;")
2023-03-27 13:59:15,650 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,654 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_adi;")
2023-03-27 13:59:15,656 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,658 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_adi;
2023-03-27 13:59:15,659 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,661 INFO sqlalchemy.engine.Engine ROLLBACK
2023-03-27 13:59:15,663 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count FROM nj_adi;")
2023-03-27 13:59:15,664 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,669 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count FROM nj_adi;")
2023-03-27 13:59:15,672 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,674 INFO sqlalchemy.engine.Engine SELECT count(*) as count FROM nj_adi;
2023-03-27 13:59:15,678 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,683 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:15,687 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,689 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:15,691 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,693 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_counties_dist_to_major_cities;
2023-03-27 13:59:15,695 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,697 INFO sqlalchemy.engine.Engine ROLLBACK
2023-03-27 13:59:15,699 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:15,701 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,708 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:15,710 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,712 INFO sqlalchemy.engine.Engine SELECT count(*) as count FROM nj_counties_dist_to_major_cities;
2023-03-27 13:59:15,714 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,720 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_poverty_median_income;")
2023-03-27 13:59:15,722 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,724 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_poverty_median_income;")
2023-03-27 13:59:15,726 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,730 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_poverty_median_income;
2023-03-27 13:59:15,731 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,737 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_property_tax;")
2023-03-27 13:59:15,739 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,742 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_property_tax;")
2023-03-27 13:59:15,744 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,747 INFO sqlalchemy.engine.Engine SELECT count(*) as count, MIN(year) as min_year, MAX(year) as max_year FROM nj_property_tax;
2023-03-27 13:59:15,748 INFO sqlalchemy.engine.Engine [raw sql] ()
Out[4]:
name row_count min_year max_year
0 nj_mortgage_rates 32 1991 2022
1 nj_population 252 2010 2021
2 nj_zillow_house_value_index 2520 2000 2023
3 nj_food_desert 2002 NaN NaN
4 nj_crime_detail 9646 2017 2020
5 nj_school_performance 8888 2017 2020
6 nj_zillow_observed_rent_index 180 2015 2023
7 nj_adi 6437 NaN NaN
8 nj_counties_dist_to_major_cities 21 NaN NaN
9 nj_poverty_median_income 462 2000 2021
10 nj_property_tax 12972 2000 2022

Most of the datasets are historical.

Crime¶

In [5]:
sql_query = """SELECT * FROM nj_crime_detail;"""
crime_df = pd.read_sql(sql_query,sqlite_connection)
crime_df.info()
2023-03-27 13:59:15,814 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_crime_detail;")
2023-03-27 13:59:15,816 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,825 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_crime_detail;")
2023-03-27 13:59:15,827 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:15,831 INFO sqlalchemy.engine.Engine SELECT * FROM nj_crime_detail;
2023-03-27 13:59:15,834 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9646 entries, 0 to 9645
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   county_name  9646 non-null   object 
 1   year         9646 non-null   int64  
 2   agency       9646 non-null   object 
 3   report_type  9646 non-null   object 
 4   population   9521 non-null   float64
 5   murder       9646 non-null   float64
 6   rape         9646 non-null   float64
 7   robbery      9646 non-null   float64
 8   assault      9646 non-null   float64
 9   burglary     9646 non-null   float64
 10  larceny      9646 non-null   float64
 11  auto_theft   9646 non-null   float64
 12  total        9646 non-null   float64
dtypes: float64(9), int64(1), object(3)
memory usage: 979.8+ KB
In [6]:
# five number summary
crime_df.describe()
Out[6]:
year population murder rape robbery assault burglary larceny auto_theft total
count 9,646.00 9,521.00 9,646.00 9,646.00 9,646.00 9,646.00 9,646.00 9,646.00 9,646.00 9,646.00
mean 2,018.80 15,751.89 2.36 5.75 15.05 35.78 51.62 333.61 25.58 446.59
std 0.98 24,654.68 13.93 17.90 74.20 218.42 128.47 3,398.15 123.21 3,594.00
min 2,017.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
25% 2,018.00 2,934.00 0.00 0.00 0.00 0.00 0.00 5.00 0.00 10.00
50% 2,019.00 8,271.00 0.00 0.00 0.00 5.00 7.00 25.00 1.00 36.00
75% 2,020.00 18,874.00 0.00 1.00 8.00 40.77 38.00 198.15 17.00 292.00
max 2,020.00 283,673.00 232.80 300.00 5,797.10 20,000.00 2,068.30 194,117.60 7,352.90 202,941.20
In [7]:
df = crime_df[crime_df['report_type']=='Rate Per 100,000'].groupby(['county_name', 'year'],\
                                                                             as_index=False).mean()
# crime by county trend
fig = px.line(df, x="year", y = 'total', title='Crime by County',\
              color='county_name', markers=True)
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [2017,2018,2019,2020],
        ticktext = [2017,2018,2019,2020]
    ),
        yaxis_title="Crime Per 100k population"
)
fig.show()

Crime rate is going down in most of the counties across years, except Cape May.

In [8]:
# crime by type trend
crime_type=df.groupby('year',as_index=False).mean()
fig = px.line(crime_type, x="year", y = crime_type.columns[2:-1], title='Crime by types', markers=True)
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [2017,2018,2019,2020],
        ticktext = [2017,2018,2019,2020]
    ),
        yaxis_title="Crime Per 100k population"
)
fig.show()

Most frequent violent crime in NJ is larceny which is also going down along with all other violent crimes over time.

Poverty and Median Income¶

In [9]:
sql_query = """SELECT * FROM nj_poverty_median_income;"""
pov_mhi_df = pd.read_sql(sql_query,sqlite_connection)
pov_mhi_df.info()
2023-03-27 13:59:18,989 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_poverty_median_income;")
2023-03-27 13:59:18,990 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:18,992 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_poverty_median_income;")
2023-03-27 13:59:18,994 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:18,996 INFO sqlalchemy.engine.Engine SELECT * FROM nj_poverty_median_income;
2023-03-27 13:59:18,997 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462 entries, 0 to 461
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   county_name       462 non-null    object 
 1   median_hh_income  462 non-null    int64  
 2   poverty_count     462 non-null    int64  
 3   poverty_rate      462 non-null    float64
 4   st_abb            462 non-null    object 
 5   year              462 non-null    int64  
 6   state_code        462 non-null    object 
 7   county_code       462 non-null    object 
dtypes: float64(1), int64(3), object(4)
memory usage: 29.0+ KB
In [53]:
# five number summary
pov_mhi_df.describe()
Out[53]:
median_hh_income poverty_count poverty_rate year
count 462.00 462.00 462.00 462.00
mean 69,642.02 39,028.81 9.24 2,010.50
std 18,676.75 30,750.71 3.96 6.35
min 36,574.00 3,178.00 2.50 2,000.00
25% 54,819.75 12,870.75 5.90 2,005.00
50% 66,456.00 29,364.50 8.50 2,010.50
75% 80,453.75 58,289.75 11.90 2,016.00
max 123,708.00 136,161.00 20.10 2,021.00
In [39]:
# trend
fig = px.line(pov_mhi_df, x="year", y="poverty_rate", title='Poverty', color='county_name', markers=True)
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [x for x in range(2000,2022)],
        ticktext = [x for x in range(2000,2022)]
    )
)
fig.show()

Poverty rate is going up and down since 2000 through 2021. But unltimately poverty rate in 2021 is more than what it was back in 2000.

In [40]:
# trend
fig = px.line(pov_mhi_df, x="year", y="median_hh_income", title='Median Income', color='county_name', markers=True)
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [x for x in range(2000,2022)],
        ticktext = [x for x in range(2000,2022)]
    )
)
fig.show()

Median income is steadily going up across time.

School Performance¶

In [12]:
sql_query = """SELECT * FROM nj_school_performance;"""
school_df = pd.read_sql(sql_query,sqlite_connection)
school_df.info()
2023-03-27 13:59:19,643 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_school_performance;")
2023-03-27 13:59:19,645 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:19,648 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_school_performance;")
2023-03-27 13:59:19,652 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:19,656 INFO sqlalchemy.engine.Engine SELECT * FROM nj_school_performance;
2023-03-27 13:59:19,659 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8888 entries, 0 to 8887
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rank            8888 non-null   int64  
 1   school          8888 non-null   object 
 2   grades          8888 non-null   object 
 3   district        8888 non-null   object 
 4   students        8888 non-null   int64  
 5   free_lunch_rec  8863 non-null   float64
 6   total_exp       8790 non-null   float64
 7   score           8888 non-null   float64
 8   year            8888 non-null   int64  
 9   school_type     8888 non-null   object 
 10  type            8888 non-null   object 
 11  address         8888 non-null   object 
 12  city            8888 non-null   object 
 13  zip             8888 non-null   object 
 14  county_name     8888 non-null   object 
dtypes: float64(3), int64(3), object(9)
memory usage: 1.0+ MB
In [51]:
# five number summary
school_df.describe()
Out[51]:
rank students free_lunch_rec total_exp score year
count 8,888.00 8,888.00 8,863.00 8,790.00 8,888.00 8,888.00
mean 505.87 593.56 37.71 11,537.00 49.73 2,018.42
std 369.59 450.28 29.97 3,494.85 27.08 1.12
min 1.00 0.00 0.00 337.00 1.20 2,017.00
25% 198.00 333.00 10.50 9,233.00 26.40 2,017.00
50% 403.50 471.00 30.90 11,210.00 49.30 2,018.00
75% 766.25 708.00 63.40 13,433.00 73.90 2,019.00
max 1,336.00 6,298.00 100.00 33,152.00 99.20 2,020.00
In [14]:
# trend - County
df = school_df.groupby(['year', 'county_name']).agg(MeanScore=('score', 'mean')).reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
# trend
fig = px.line(df, x="year", y="MeanScore", title='School Scores by County', color='county_name', markers=True)
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [2017,2018,2019,2020],
        ticktext = [2017,2018,2019,2020]
    )
)
fig.show()

On average, NJ schools maintained their quality/ranking over time with a couple of exceptions like Passiac, Sussex and Bergen.

In [15]:
# trend - Shcool types/ phases
df = school_df.groupby(['year', 'school_type']).agg(MeanScore=('score', 'mean')).reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
# trend
fig = px.line(df, x="year", y="MeanScore", title='School Scores by School Types', color='school_type', markers=True)
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [2017,2018,2019,2020],
        ticktext = [2017,2018,2019,2020]
    )
)
fig.show()

Mean scores for elementary schools went down. Middle schools saw a steep decline from 2018-2019 then got back up in 2020. High school scores went up by a lot since 2019 onwards.

In [16]:
# trend - type
df = school_df.groupby(['year', 'type']).agg(MeanScore=('score', 'mean')).reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
# trend
fig = px.line(df, x="year", y="MeanScore", title='School Scores by Type', color='type', markers=True)
fig.update_layout(
    xaxis = dict(
        tickmode = 'array',
        tickvals = [2017,2018,2019,2020],
        ticktext = [2017,2018,2019,2020]
    )
)
fig.show()

Mean scores for Public,Alternative schools went down. Public, Charter, Alternative schools scores saw a decline from 2018. Public school maintained their scores and Public, Charter schools scores decreased a little over time.

Zillow House Price Index¶

In [17]:
sql_query = """SELECT * FROM nj_zillow_house_value_index;"""
zhvi_df = pd.read_sql(sql_query,sqlite_connection)
zhvi_df.info()
2023-03-27 13:59:20,520 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_zillow_house_value_index;")
2023-03-27 13:59:20,522 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:20,525 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_zillow_house_value_index;")
2023-03-27 13:59:20,527 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:20,528 INFO sqlalchemy.engine.Engine SELECT * FROM nj_zillow_house_value_index;
2023-03-27 13:59:20,530 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520 entries, 0 to 2519
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   county_name        2520 non-null   object 
 1   year               2520 non-null   int64  
 2   num_of_bedrooms    2520 non-null   int64  
 3   house_value_index  2500 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 78.9+ KB
In [52]:
# five number summary
zhvi_df.describe()
Out[52]:
year num_of_bedrooms house_value_index
count 2,520.00 2,520.00 2,500.00
mean 2,011.50 3.00 300,644.65
std 6.92 1.41 180,913.97
min 2,000.00 1.00 24,948.62
25% 2,005.75 2.00 169,532.93
50% 2,011.50 3.00 258,947.71
75% 2,017.25 4.00 388,660.31
max 2,023.00 5.00 1,620,305.39
In [19]:
# trend
df = zhvi_df.groupby(['year','num_of_bedrooms'],as_index=False).agg({'house_value_index':'mean'})
fig = px.line(df, x="year", y = 'house_value_index', color='num_of_bedrooms',\
              title='Price by bedrooms in NJ', markers=True)
fig.show()

House prices for all bedrooms are going up over time except for a short decline from 2007 to 2012.

In [20]:
# trend
df = zhvi_df.groupby(['year','county_name'],as_index=False).agg({'house_value_index':'mean'})
fig = px.line(df, x="year", y = 'house_value_index', color='county_name',\
              title='Price by county in NJ', markers=True)
fig.show()

House prices for all counties are going up over time except for a short decline from 2007 to 2012.

Zillow Observed Rent Index¶

In [21]:
sql_query = """SELECT * FROM nj_zillow_observed_rent_index;"""
zori_df = pd.read_sql(sql_query,sqlite_connection)
zori_df.info()
2023-03-27 13:59:21,099 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_zillow_observed_rent_index;")
2023-03-27 13:59:21,105 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,112 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_zillow_observed_rent_index;")
2023-03-27 13:59:21,114 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,116 INFO sqlalchemy.engine.Engine SELECT * FROM nj_zillow_observed_rent_index;
2023-03-27 13:59:21,118 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   county_name          180 non-null    object 
 1   year                 180 non-null    int64  
 2   observed_rent_index  157 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.3+ KB
In [54]:
# five number summary
zori_df.describe()
Out[54]:
year observed_rent_index
count 180.00 157.00
mean 2,019.00 1,876.35
std 2.59 384.72
min 2,015.00 1,185.20
25% 2,017.00 1,576.68
50% 2,019.00 1,850.26
75% 2,021.00 2,147.89
max 2,023.00 2,855.24
In [22]:
# trend
fig = px.line(zori_df, x="year", y = 'observed_rent_index', color='county_name',\
              title='Rent Price by county in NJ', markers=True)
fig.show()

Rental prices for all counties are going up over time.

Property Tax¶

In [23]:
sql_query = """SELECT * FROM nj_property_tax;"""
tax_df = pd.read_sql(sql_query,sqlite_connection)
tax_df.info()
2023-03-27 13:59:21,361 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_property_tax;")
2023-03-27 13:59:21,364 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,367 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_property_tax;")
2023-03-27 13:59:21,369 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,374 INFO sqlalchemy.engine.Engine SELECT * FROM nj_property_tax;
2023-03-27 13:59:21,376 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12972 entries, 0 to 12971
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   county_code    12972 non-null  object 
 1   county_name    12972 non-null  object 
 2   district_code  12972 non-null  object 
 3   district_name  12972 non-null  object 
 4   year           12972 non-null  int64  
 5   tax_rate       12972 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 608.2+ KB
In [106]:
# five number summary
tax_df.describe()
Out[106]:
year tax_rate
count 12,972.00 12,972.00
mean 2,011.00 3.23
std 6.63 2.31
min 2,000.00 0.00
25% 2,005.00 2.16
50% 2,011.00 2.82
75% 2,017.00 3.63
max 2,022.00 37.77
In [25]:
# trend
df = tax_df.groupby(['year', 'county_name']).agg(tax_rate=('tax_rate','mean')).reset_index()
df['year'] = pd.to_datetime(df['year'], format='%Y')
fig = px.line(df, x="year", y="tax_rate", title='Property Tax Rate', color='county_name', markers=True)
fig.show()

Essex property tax saw a huge decline since 2005 through 2011. Union property tax was on the rise since 2000 till 2015 and then it started to decline. Hudson also saw increase in property tax up until 2016 and then it went down. The rest of the counties kind of maintained same property tax across time.

In [26]:
tax_df[tax_df['year']==2022].groupby(['county_name']).agg({'tax_rate':['min','mean','max']})
Out[26]:
tax_rate
min mean max
county_name
ATLANTIC 1.01 3.01 5.45
BERGEN 0.74 2.55 4.23
BURLINGTON 1.28 3.08 4.87
CAMDEN 1.74 4.29 7.83
CAPE MAY 0.57 1.43 2.83
CUMBERLAND 2.37 3.30 5.03
ESSEX 1.94 3.14 5.96
GLOUCESTER 2.09 3.66 4.89
HUDSON 1.58 4.01 10.54
HUNTERDON 2.09 2.94 4.25
MERCER 2.43 3.36 5.46
MIDDLESEX 1.78 5.07 11.46
MONMOUTH 0.52 1.74 3.03
MORRIS 1.13 2.56 3.27
OCEAN 0.70 1.99 3.25
PASSAIC 2.44 4.10 5.71
SALEM 1.62 3.73 7.52
SOMERSET 1.33 2.37 4.08
SUSSEX 0.72 3.27 4.56
UNION 2.00 7.94 30.58
WARREN 2.31 3.52 5.87

The table above shows the lowest, average and highest property taxes by NJ counties.

Mortgage Rates¶

In [27]:
sql_query = """SELECT * FROM nj_mortgage_rates;"""
apr_df = pd.read_sql(sql_query,sqlite_connection)
apr_df.info()
2023-03-27 13:59:21,907 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_mortgage_rates;")
2023-03-27 13:59:21,909 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,912 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_mortgage_rates;")
2023-03-27 13:59:21,913 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:21,914 INFO sqlalchemy.engine.Engine SELECT * FROM nj_mortgage_rates;
2023-03-27 13:59:21,916 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   year       32 non-null     int64  
 1   apr_30     32 non-null     float64
 2   points_30  32 non-null     float64
 3   apr_15     32 non-null     float64
 4   points_15  32 non-null     float64
dtypes: float64(4), int64(1)
memory usage: 1.4 KB
In [105]:
# five number summary
apr_df.describe()
Out[105]:
year apr_30 points_30 apr_15 points_15
count 32.00 32.00 32.00 32.00 32.00
mean 2,006.50 5.80 0.91 5.25 0.89
std 9.38 1.75 0.47 1.86 0.46
min 1,991.00 2.96 0.43 2.27 0.45
25% 1,998.75 4.12 0.59 3.37 0.58
50% 2,006.50 5.85 0.72 5.31 0.68
75% 2,014.25 7.35 1.00 6.88 0.99
max 2,022.00 8.79 1.83 8.40 1.77
In [28]:
# trend
fig = px.line(apr_df, x="year", y = apr_df.columns[1:], title='Mortgage APR', markers=True)
fig.show()

APR for 15 years and 30 yearsd were going down steadily since 1990, it started to shoot up since 2021 onwards. Points for both 15 and 30 years mortgages declined a little bit but mostly stayed the same.

Population¶

In [29]:
sql_query = """SELECT * FROM nj_population;"""
pop_df = pd.read_sql(sql_query,sqlite_connection)
pop_df.info()
2023-03-27 13:59:22,096 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_population;")
2023-03-27 13:59:22,098 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,103 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_population;")
2023-03-27 13:59:22,106 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,108 INFO sqlalchemy.engine.Engine SELECT * FROM nj_population;
2023-03-27 13:59:22,109 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   county_name  252 non-null    object
 1   est_pop      252 non-null    int64 
 2   year         252 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 6.0+ KB
In [96]:
# five number summary
pop_df.describe()
Out[96]:
est_pop year
count 252.00 252.00
mean 423,663.90 2,015.50
std 254,065.46 3.46
min 62,341.00 2,010.00
25% 150,928.75 2,012.75
50% 448,449.00 2,015.50
75% 603,111.25 2,018.25
max 953,819.00 2,021.00
In [30]:
# trend
fig = px.line(pop_df, x="year", y = 'est_pop', color='county_name',\
              title='Population (Est) in NJ', markers=True)
fig.show()

Population counts (estimated) are increasing very slightly over the years for all the counties.

Food Desert¶

In [31]:
sql_query = """SELECT * FROM nj_food_desert;"""
food_df = pd.read_sql(sql_query,sqlite_connection)
food_df.info(verbose=True, null_counts=True)
2023-03-27 13:59:22,360 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_food_desert;")
2023-03-27 13:59:22,361 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,363 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_food_desert;")
2023-03-27 13:59:22,365 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,369 INFO sqlalchemy.engine.Engine SELECT * FROM nj_food_desert;
2023-03-27 13:59:22,370 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2002 entries, 0 to 2001
Data columns (total 148 columns):
 #    Column                Non-Null Count  Dtype  
---   ------                --------------  -----  
 0    censustract           2002 non-null   int64  
 1    state                 2002 non-null   object 
 2    county                2002 non-null   object 
 3    urban                 2002 non-null   int64  
 4    pop2010               2002 non-null   int64  
 5    ohu2010               2002 non-null   int64  
 6    groupquartersflag     2002 non-null   int64  
 7    numgqtrs              2002 non-null   float64
 8    pctgqtrs              2002 non-null   float64
 9    lilatracts_1and10     2002 non-null   int64  
 10   lilatracts_halfand10  2002 non-null   int64  
 11   lilatracts_1and20     2002 non-null   int64  
 12   lilatracts_vehicle    2002 non-null   int64  
 13   hunvflag              2002 non-null   int64  
 14   lowincometracts       2002 non-null   int64  
 15   povertyrate           2002 non-null   float64
 16   medianfamilyincome    1988 non-null   float64
 17   la1and10              2002 non-null   int64  
 18   lahalfand10           2002 non-null   int64  
 19   la1and20              2002 non-null   int64  
 20   latracts_half         2002 non-null   int64  
 21   latracts1             2002 non-null   int64  
 22   latracts10            2002 non-null   int64  
 23   latracts20            2002 non-null   int64  
 24   latractsvehicle_20    2002 non-null   int64  
 25   lapop1_10             1079 non-null   float64
 26   lapop05_10            1603 non-null   float64
 27   lapop1_20             1068 non-null   float64
 28   lalowi1_10            1079 non-null   float64
 29   lalowi05_10           1603 non-null   float64
 30   lalowi1_20            1068 non-null   float64
 31   lapophalf             1742 non-null   float64
 32   lapophalfshare        1742 non-null   float64
 33   lalowihalf            1742 non-null   float64
 34   lalowihalfshare       1742 non-null   float64
 35   lakidshalf            1742 non-null   float64
 36   lakidshalfshare       1742 non-null   float64
 37   laseniorshalf         1742 non-null   float64
 38   laseniorshalfshare    1742 non-null   float64
 39   lawhitehalf           1742 non-null   float64
 40   lawhitehalfshare      1742 non-null   float64
 41   lablackhalf           1742 non-null   float64
 42   lablackhalfshare      1742 non-null   float64
 43   laasianhalf           1742 non-null   float64
 44   laasianhalfshare      1742 non-null   float64
 45   lanhopihalf           1742 non-null   float64
 46   lanhopihalfshare      1742 non-null   float64
 47   laaianhalf            1742 non-null   float64
 48   laaianhalfshare       1742 non-null   float64
 49   laomultirhalf         1742 non-null   float64
 50   laomultirhalfshare    1742 non-null   float64
 51   lahisphalf            1742 non-null   float64
 52   lahisphalfshare       1742 non-null   float64
 53   lahunvhalf            1742 non-null   float64
 54   lahunvhalfshare       1743 non-null   float64
 55   lasnaphalf            1742 non-null   float64
 56   lasnaphalfshare       1743 non-null   float64
 57   lapop1                1218 non-null   float64
 58   lapop1share           1218 non-null   float64
 59   lalowi1               1218 non-null   float64
 60   lalowi1share          1218 non-null   float64
 61   lakids1               1218 non-null   float64
 62   lakids1share          1218 non-null   float64
 63   laseniors1            1218 non-null   float64
 64   laseniors1share       1218 non-null   float64
 65   lawhite1              1218 non-null   float64
 66   lawhite1share         1218 non-null   float64
 67   lablack1              1218 non-null   float64
 68   lablack1share         1218 non-null   float64
 69   laasian1              1218 non-null   float64
 70   laasian1share         1218 non-null   float64
 71   lanhopi1              1218 non-null   float64
 72   lanhopi1share         1218 non-null   float64
 73   laaian1               1218 non-null   float64
 74   laaian1share          1218 non-null   float64
 75   laomultir1            1218 non-null   float64
 76   laomultir1share       1218 non-null   float64
 77   lahisp1               1218 non-null   float64
 78   lahisp1share          1218 non-null   float64
 79   lahunv1               1218 non-null   float64
 80   lahunv1share          1220 non-null   float64
 81   lasnap1               1218 non-null   float64
 82   lasnap1share          1220 non-null   float64
 83   lapop10               11 non-null     float64
 84   lapop10share          11 non-null     float64
 85   lalowi10              11 non-null     float64
 86   lalowi10share         11 non-null     float64
 87   lakids10              11 non-null     float64
 88   lakids10share         11 non-null     float64
 89   laseniors10           11 non-null     float64
 90   laseniors10share      11 non-null     float64
 91   lawhite10             11 non-null     float64
 92   lawhite10share        11 non-null     float64
 93   lablack10             11 non-null     float64
 94   lablack10share        11 non-null     float64
 95   laasian10             11 non-null     float64
 96   laasian10share        11 non-null     float64
 97   lanhopi10             11 non-null     float64
 98   lanhopi10share        11 non-null     float64
 99   laaian10              11 non-null     float64
 100  laaian10share         11 non-null     float64
 101  laomultir10           11 non-null     float64
 102  laomultir10share      11 non-null     float64
 103  lahisp10              11 non-null     float64
 104  lahisp10share         11 non-null     float64
 105  lahunv10              11 non-null     float64
 106  lahunv10share         13 non-null     float64
 107  lasnap10              11 non-null     float64
 108  lasnap10share         13 non-null     float64
 109  lapop20               0 non-null      object 
 110  lapop20share          0 non-null      object 
 111  lalowi20              0 non-null      object 
 112  lalowi20share         0 non-null      object 
 113  lakids20              0 non-null      object 
 114  lakids20share         0 non-null      object 
 115  laseniors20           0 non-null      object 
 116  laseniors20share      0 non-null      object 
 117  lawhite20             0 non-null      object 
 118  lawhite20share        0 non-null      object 
 119  lablack20             0 non-null      object 
 120  lablack20share        0 non-null      object 
 121  laasian20             0 non-null      object 
 122  laasian20share        0 non-null      object 
 123  lanhopi20             0 non-null      object 
 124  lanhopi20share        0 non-null      object 
 125  laaian20              0 non-null      object 
 126  laaian20share         0 non-null      object 
 127  laomultir20           0 non-null      object 
 128  laomultir20share      0 non-null      object 
 129  lahisp20              0 non-null      object 
 130  lahisp20share         0 non-null      object 
 131  lahunv20              0 non-null      object 
 132  lahunv20share         3 non-null      float64
 133  lasnap20              0 non-null      object 
 134  lasnap20share         3 non-null      float64
 135  tractlowi             2002 non-null   float64
 136  tractkids             2002 non-null   float64
 137  tractseniors          2002 non-null   float64
 138  tractwhite            2002 non-null   float64
 139  tractblack            2002 non-null   float64
 140  tractasian            2002 non-null   float64
 141  tractnhopi            2002 non-null   float64
 142  tractaian             2002 non-null   float64
 143  tractomultir          2002 non-null   float64
 144  tracthispanic         2002 non-null   float64
 145  tracthunv             2002 non-null   float64
 146  tractsnap             2002 non-null   float64
 147  county_name           2002 non-null   object 
dtypes: float64(102), int64(19), object(27)
memory usage: 2.3+ MB
<ipython-input-31-02ebe01409f7>:3: FutureWarning:

null_counts is deprecated. Use show_counts instead

In [103]:
# five number summary
food_df.describe()
Out[103]:
censustract urban pop2010 ohu2010 groupquartersflag numgqtrs pctgqtrs lilatracts_1and10 lilatracts_halfand10 lilatracts_1and20 ... tractseniors tractwhite tractblack tractasian tractnhopi tractaian tractomultir tracthispanic tracthunv tractsnap
count 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 ... 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00 2,002.00
mean 34,019,297,884.01 0.93 4,391.56 1,605.57 0.01 93.34 2.19 0.05 0.17 0.05 ... 592.40 3,011.61 601.81 362.50 1.52 14.50 399.61 776.80 183.70 144.89
std 11,281,186.46 0.26 1,779.84 651.96 0.07 378.36 8.72 0.23 0.38 0.22 ... 393.53 1,732.43 843.23 576.24 2.70 19.06 471.97 975.97 211.90 158.42
min 34,001,000,100.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
25% 34,009,020,725.00 1.00 3,125.50 1,138.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 343.00 1,764.75 81.00 50.00 0.00 4.00 114.00 202.25 43.00 33.00
50% 34,019,010,801.50 1.00 4,194.50 1,546.00 0.00 5.00 0.11 0.00 0.00 0.00 ... 513.50 2,887.00 236.00 151.00 0.00 8.00 223.00 388.00 108.00 88.00
75% 34,027,045,901.75 1.00 5,536.75 2,006.00 0.00 38.00 0.92 0.00 0.00 0.00 ... 744.00 4,085.25 739.00 412.00 2.00 18.00 463.75 894.25 248.00 201.00
max 34,041,032,400.00 1.00 15,765.00 5,270.00 1.00 6,451.00 100.00 1.00 1.00 1.00 ... 3,482.00 11,217.00 5,126.00 6,680.00 32.00 306.00 3,921.00 6,659.00 2,520.00 1,043.00

8 rows × 121 columns

In [32]:
# trend
df = food_df.groupby(['county_name'],as_index=False).agg({'la1and10':'sum','lahalfand10':'sum',
                                                             'la1and20':'sum'})
fig = px.bar(df, x="county_name", y = df.columns[1:],\
              title='Food desert tracts by county in NJ', barmode='group')
fig.show()

Food desert counts are mostly same for both 1 mile radius in urban areas and 10 mile radius in rural areas and 1 mile radius in urban areas and 20 mile radius in rural areas. Counts are high for half mile radius in urban areas and 10 mile radius for rural areas for Middlesex, Bergen and Monmouth.*

Area Deprivation Index¶

In [33]:
sql_query = """SELECT * FROM nj_adi;"""
adi_df = pd.read_sql(sql_query,sqlite_connection)
adi_df.info()
2023-03-27 13:59:22,814 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_adi;")
2023-03-27 13:59:22,817 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,821 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_adi;")
2023-03-27 13:59:22,823 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:22,825 INFO sqlalchemy.engine.Engine SELECT * FROM nj_adi;
2023-03-27 13:59:22,827 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6437 entries, 0 to 6436
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   GISJOIN       6437 non-null   object 
 1   ADI_NATRANK   6437 non-null   float64
 2   ADI_STATERNK  6437 non-null   float64
 3   FIPS          6437 non-null   object 
 4   county_code   6437 non-null   object 
 5   county_name   6437 non-null   object 
dtypes: float64(2), object(4)
memory usage: 301.9+ KB
In [102]:
# five number summary
adi_df.describe()
Out[102]:
ADI_NATRANK ADI_STATERNK
count 6,437.00 6,437.00
mean 31.45 5.50
std 20.65 2.87
min 1.00 1.00
25% 16.00 3.00
50% 28.00 5.00
75% 43.00 8.00
max 100.00 10.00
In [34]:
# trend
df = adi_df.groupby(['county_name'],as_index=False).agg({'ADI_NATRANK':'mean',
                                                             'ADI_STATERNK':'mean'})
fig = px.bar(df, x="county_name", y = df.columns[1:],\
              title='ADI Ranking by county in NJ', barmode='group')
fig.show()

Counties like Cumberland, Salem, Camde and Atlantic are ranked pretty far down the list of counties nationwide in terms of area deprivation index, which transaltes low socio economic and educational opportunities.

Distance from Major Cities¶

In [35]:
sql_query = """SELECT * FROM nj_counties_dist_to_major_cities;"""
distance_df = pd.read_sql(sql_query,sqlite_connection)
distance_df.info()
2023-03-27 13:59:23,015 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("SELECT * FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:23,018 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:23,021 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("SELECT * FROM nj_counties_dist_to_major_cities;")
2023-03-27 13:59:23,024 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-27 13:59:23,026 INFO sqlalchemy.engine.Engine SELECT * FROM nj_counties_dist_to_major_cities;
2023-03-27 13:59:23,028 INFO sqlalchemy.engine.Engine [raw sql] ()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   county_name           21 non-null     object 
 1   county_code           21 non-null     object 
 2   dist_to_manhattan     21 non-null     float64
 3   dist_to_queens        21 non-null     float64
 4   dist_to_brooklyn      21 non-null     float64
 5   dist_to_bronx         21 non-null     float64
 6   dist_to_jersery_city  21 non-null     float64
 7   dist_to_newark        21 non-null     float64
 8   dist_to_trenton       21 non-null     float64
 9   dist_to_philadelphia  21 non-null     float64
dtypes: float64(8), object(2)
memory usage: 1.8+ KB
In [36]:
# trend
for i in distance_df.select_dtypes('float').columns:
        fig=px.bar(distance_df, x="county_name", y = i,\
              title=f'''Distance from {i.replace('dist_to_','').title()} by county in NJ''',\
                   barmode='group')
        fig.show()

The charts above shows distances from major cities by counties.

In [37]:
# close connection
sqlite_connection.close()

Combining all domains¶

In [42]:
# Aggregation at the county level
crime_df_agg = crime_df[crime_df['report_type']=='Rate Per 100,000'].drop('population',\
                                                                          axis=1).groupby(['county_name', \
                                                                                           'year'],\
                                                                             as_index=False).mean()
school_df_agg = school_df.groupby(['year', 'county_name']).agg(MeanScore=('score', 'mean')).reset_index()
tax_df_agg = tax_df.groupby(['year', 'county_name']).agg(tax_rate=('tax_rate', 'mean')).reset_index()
food_df_agg = food_df.groupby(['county_name'],as_index=False).agg({'la1and10':'sum','lahalfand10':'sum',
                                                             'la1and20':'sum'})
adi_df_agg = adi_df.groupby(['county_name'],as_index=False).agg({'ADI_NATRANK':'mean',
                                                             'ADI_STATERNK':'mean'})
In [99]:
all_df = pd.merge(crime_df_agg,school_df_agg,how='inner',on=['county_name','year'])
all_df = pd.merge(all_df,tax_df_agg,how='inner',on=['county_name','year'])
all_df = pd.merge(all_df,zhvi_df,how='inner',on=['county_name','year'])
all_df = pd.merge(all_df,zori_df,how='inner',on=['county_name','year'])
all_df = pd.merge(all_df,pop_df,how='inner',on=['county_name','year'])
all_df = pd.merge(all_df,pov_mhi_df,how='inner',on=['county_name','year'])
all_df.head()
Out[99]:
county_name year murder rape robbery assault burglary larceny auto_theft total ... num_of_bedrooms house_value_index observed_rent_index est_pop median_hh_income poverty_count poverty_rate st_abb state_code county_code
0 ATLANTIC 2017 6.50 13.73 78.73 118.58 427.38 1,695.56 81.70 2,422.18 ... 1 81,887.96 1,303.83 265446 59309 38092 14.40 NJ 034 001
1 ATLANTIC 2017 6.50 13.73 78.73 118.58 427.38 1,695.56 81.70 2,422.18 ... 2 125,858.11 1,303.83 265446 59309 38092 14.40 NJ 034 001
2 ATLANTIC 2017 6.50 13.73 78.73 118.58 427.38 1,695.56 81.70 2,422.18 ... 3 175,185.58 1,303.83 265446 59309 38092 14.40 NJ 034 001
3 ATLANTIC 2017 6.50 13.73 78.73 118.58 427.38 1,695.56 81.70 2,422.18 ... 4 245,768.02 1,303.83 265446 59309 38092 14.40 NJ 034 001
4 ATLANTIC 2017 6.50 13.73 78.73 118.58 427.38 1,695.56 81.70 2,422.18 ... 5 398,474.97 1,303.83 265446 59309 38092 14.40 NJ 034 001

5 rows × 22 columns

In [100]:
corr = all_df.corr()
fig = px.imshow(corr, aspect="auto",height=1000, title ='Correlation heatmap for Historical data by Plotly')
fig.show()

House value index has strong positive correlation with number of bedrooms and has weak positive correlation with school score, rent index, population and median household income. It has weak negative correlation with poverty rate but weak positive correlation with poverty count.

Just 2020¶

In [46]:
#  joinig all 2020 rows
all_2020_df=pd.merge(crime_df_agg[crime_df_agg['year']==2020], \
                  pov_mhi_df[pov_mhi_df['year']==2020].drop(['st_abb', 'state_code', 'county_code'],axis=1),\
                  how='inner', on = ['county_name', 'year']).drop('year',axis=1)
all_2020_df=pd.merge(all_2020_df, school_df_agg[school_df_agg['year']==2020].drop('year',axis=1), how='inner',\
                  on = 'county_name')
all_2020_df=pd.merge(all_2020_df, zhvi_df[zhvi_df['year']==2020].drop('year',\
                                            axis=1).groupby('county_name',\
                                            as_index=False).agg({'house_value_index':'mean'}),\
                how='inner',   on = 'county_name')
all_2020_df=pd.merge(all_2020_df, zori_df[zori_df['year']==2020].drop('year',axis=1),\
                  how='inner', on='county_name')
all_2020_df=pd.merge(all_2020_df,tax_df_agg[tax_df_agg['year']==2020].drop('year',axis=1),\
               how='inner', on='county_name')
all_2020_df=pd.merge(all_2020_df,pop_df[pop_df['year']==2020].drop('year',axis=1),\
               how='inner', on='county_name')
all_2020_df=pd.merge(all_2020_df,food_df_agg, how='inner', on='county_name')
all_2020_df=pd.merge(all_2020_df,adi_df_agg, how='inner', on='county_name')
all_2020_df=pd.merge(all_2020_df,distance_df, how='inner', on='county_name')
all_2020_df
Out[46]:
county_name murder rape robbery assault burglary larceny auto_theft total median_hh_income ... ADI_STATERNK county_code dist_to_manhattan dist_to_queens dist_to_brooklyn dist_to_bronx dist_to_jersery_city dist_to_newark dist_to_trenton dist_to_philadelphia
0 ATLANTIC 3.00 12.68 41.55 95.54 274.96 1,287.86 81.83 1,797.43 61377 ... 8.03 001 51.68 62.88 52.48 61.56 49.36 41.69 14.77 35.00
1 BERGEN 0.41 4.33 10.48 29.65 70.74 2,243.08 94.14 2,452.84 107114 ... 3.42 003 6.66 17.59 7.74 14.58 5.08 8.16 49.47 77.68
2 BURLINGTON 2.56 17.19 18.47 98.75 117.88 720.39 56.77 1,032.01 92471 ... 6.66 005 48.51 60.21 49.41 58.52 46.04 37.76 8.89 34.31
3 CAMDEN 1.71 8.70 50.15 142.79 230.76 1,551.97 96.20 2,082.29 71718 ... 8.26 007 68.29 80.19 69.24 78.31 65.76 57.13 15.81 14.27
4 CUMBERLAND 3.85 18.35 87.75 231.95 286.87 1,164.85 78.72 1,872.30 60352 ... 9.34 011 80.51 92.24 81.43 90.53 78.03 69.57 28.73 10.91
5 ESSEX 2.11 8.33 31.91 53.10 96.19 564.39 141.56 897.59 65528 ... 5.76 013 16.81 29.00 17.93 26.46 14.19 5.20 37.03 65.25
6 GLOUCESTER 2.30 2.87 21.88 68.83 152.47 825.12 57.14 1,130.61 87220 ... 7.39 015 80.88 92.83 81.85 90.90 78.33 69.63 28.00 4.41
7 HUDSON 0.56 9.37 36.61 81.19 83.30 696.22 71.81 979.08 77323 ... 4.76 017 4.75 16.94 5.88 14.56 2.14 6.94 48.51 76.84
8 HUNTERDON 1.24 0.78 0.20 11.95 58.86 294.44 22.16 389.63 113611 ... 4.00 019 62.91 75.10 63.97 72.72 60.29 51.27 12.35 20.54
9 MERCER 2.20 7.93 19.97 56.78 61.80 329.56 35.63 513.86 87581 ... 6.46 021 48.91 60.99 49.92 58.89 46.33 37.52 4.19 32.50
10 MIDDLESEX 1.02 6.07 14.75 62.93 80.65 570.99 49.09 785.51 95610 ... 5.29 023 28.57 40.64 29.57 38.57 26.00 17.32 24.50 52.84
11 MONMOUTH 0.12 11.56 17.81 67.53 133.32 993.27 109.26 1,332.86 104219 ... 3.97 025 17.28 28.53 18.04 27.10 15.10 9.51 37.28 65.45
12 MORRIS 0.44 4.73 6.26 29.35 80.26 484.50 57.23 662.77 114103 ... 3.43 027 37.43 49.62 38.55 47.01 34.81 25.78 19.04 45.92
13 OCEAN 0.44 8.53 19.43 84.27 129.56 1,010.39 97.07 1,349.69 78181 ... 6.08 029 25.30 35.83 25.93 34.81 23.28 17.69 32.58 60.01
14 PASSAIC 1.20 5.97 31.24 80.28 81.72 599.26 67.47 867.17 64422 ... 6.16 031 21.34 33.23 22.50 30.35 18.82 10.48 35.20 62.91
15 SALEM 30.65 34.56 80.32 180.04 315.64 799.85 111.89 1,552.90 65563 ... 8.71 033 96.06 108.03 97.04 106.07 93.51 84.76 43.08 15.12
16 SOMERSET 0.30 4.56 9.13 12.22 84.49 630.63 62.93 804.28 115573 ... 4.07 035 42.42 54.60 43.48 52.28 39.80 30.82 11.78 39.44
17 SUSSEX 1.41 8.24 0.91 28.61 47.80 262.64 17.12 366.71 92739 ... 6.60 037 48.15 60.23 49.29 57.42 45.55 36.61 17.51 39.09
18 UNION 1.79 9.50 27.22 56.83 81.37 604.69 99.97 881.40 83189 ... 5.48 039 21.07 33.27 22.14 30.94 18.46 9.48 32.25 60.53
19 WARREN 0.00 7.19 7.71 32.80 160.09 894.20 32.66 1,134.63 80412 ... 7.27 041 68.78 80.98 69.88 78.41 66.16 57.11 20.14 19.81

20 rows × 31 columns

In [47]:
corr = all_2020_df.corr()
fig = px.imshow(corr, aspect="auto",height=1000, title ='Correlation heatmap for 2020 data by Plotly')
fig.show()

House Price index have strong negative correlation with distance to all major cities except Trenton and Philadelphia as well as Area Deprivation Index (ADI) - both national and state rankings and burglary.House Price Index has strong positive correlation with Observed Rent Index and moderately strong positive correlation with population, median household income, school scores. House Price Index also have somewhat of a weak to mdoerate positive correlation to food desert count at half mile raidus.

Pvoerty rate has strong negative correlation with school score and median household income and moderately strong positive correlation different violent crimes. We will use poverty rate as a social determinant factor for our predictions.

Pandas Profiling¶

In [94]:
df_list= [crime_df, pov_mhi_df, school_df, zhvi_df, zori_df, tax_df, apr_df, pop_df,\
          food_df, adi_df, distance_df]
df_names=["crime_df", "pov_mhi_df", "school_df", "zhvi_df", "zori_df", "tax_df", "apr_df", "pop_df",\
          "food_df", "adi_df", "distance_df"]
In [95]:
# generate profile report
for i,j in zip(df_list,df_names):
    profile = ProfileReport(i, title=f"{j.replace('_df','').title()} Report",\
                       vars={"num": {"low_categorical_threshold": 0}})
    profile.to_file(f"../Documents/profiling_report_{j.replace('_df','').title()}.html")
    print(f"Report generated for {j.replace('_df','').title()}!")
Report generated for Crime!
Report generated for Pov_Mhi!
Report generated for School!
Report generated for Zhvi!
Report generated for Zori!
Report generated for Tax!
Report generated for Apr!
Report generated for Pop!
C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\missingno\missingno.py:250: UserWarning:

FixedFormatter should only be used together with FixedLocator

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\missing.py:89: UserWarning:

There was an attempt to generate the Count missing values diagrams, but this failed.
To hide this warning, disable the calculation
(using `df.profile_report(missing_diagrams={"Count": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/pandas-profiling/pandas-profiling/issues
(include the error message: 'The number of FixedLocator locations (7), usually from a call to set_ticks, does not match the number of ticklabels (148).')

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

C:\Users\mosab\AppData\Roaming\Python\Python38\site-packages\pandas_profiling\model\pandas\duplicates_pandas.py:37: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

Report generated for Food!
Report generated for Adi!
Report generated for Distance!